# Data in

# Waste Data
data.waste = read_csv('../data/ww_data_all.csv')


# Case Data
data.cases = read_csv('../data/cases_site.csv')


# Site Data
# https://stackoverflow.com/questions/1253499/simple-calculations-for-working-with-lat-lon-and-km-distance#:~:text=The%20approximate%20conversions%20are%3A,111.320*cos(latitude)%20km
minutes_to_km = 110.574
consider_dist = 80

location_coords = tibble(
  SampleLocation = c('Auckland', 'Wellington', 'Christchurch'),
  Latitude = -c(36.8508, 41.2842, 43.5299),
  Longitude = c(174.7645, 174.7775, 172.6333)
)
data.sites = read_csv('../data/sites.csv') %>% 
  mutate(dist_akld = sqrt((Latitude - location_coords[[1,2]])^2 + (Longitude - location_coords[[1,3]])^2) * minutes_to_km,
         dist_wgtn = sqrt((Latitude - location_coords[[2,2]])^2 + (Longitude - location_coords[[2,3]])^2) * minutes_to_km,
         dist_chch = sqrt((Latitude - location_coords[[3,2]])^2 + (Longitude - location_coords[[3,3]])^2) * minutes_to_km,
         consider = dist_akld < consider_dist | dist_wgtn < consider_dist | dist_chch < consider_dist) 


# Rain Data
data.weather = read_csv('../weather_data/weather_data.csv')
data.rain.sites = read_csv('../weather_data/rain_station_data_final.csv')


# Flow Data

data.flow = read_csv('../weather_data/flow_data.csv')
# Last 2 columns aren't interesting
data.flow = data.flow[,-c(4:5)]
data.flow
data.flow = read_csv('../weather_data/2 - DAILY FLOWS SEPT 2022.csv')

data.flow2 = data.flow %>% 
  mutate(DATECOLLECTED = dmy(DATECOLLECTED))

# Someone has put the bloody date in mdy format
data.flow[which(is.na(data.flow2$DATECOLLECTED)),]
data.flow2 = data.flow %>% 
  mutate(DATECOLLECTED1 = dmy(DATECOLLECTED),
         DATECOLLECTED2 = mdy(DATECOLLECTED))
## Warning: 978 failed to parse.
data.flow3 = data.flow2 %>% 
  mutate(DATECOLLECTED = case_when(is.na(DATECOLLECTED1) ~ DATECOLLECTED2, 
                                   TRUE ~ DATECOLLECTED1))


data.flow3[which(is.na(data.flow2$DATECOLLECTED)),]
# Love to see it.

data.flow = data.flow3 %>% select(DATECOLLECTED, SAMPLELOCATION = SAMPLELOCATIONSMASTER, daily_flow_rate_m3)
# write_csv(data.flow, '../weather_data/flow_data_clean.csv')

Setup the data.

# Setup Rain data to explain Flow
data.weather.whole = data.weather %>% 
  left_join(data.rain.sites %>% 
               rename('Station' = Site),  by = 'Station') %>% 
  pivot_longer(cols = 13:15, names_to = 'Del', values_to = 'catchment') %>% 
  select(-Del) %>% 
  filter(!is.na(catchment)) %>% 
  group_by(catchment) %>% 
  group_split()
  
(names(data.weather.whole) = data.weather.whole %>% map(~.$catchment %>% unique() %>% str_replace(' ', ''))) %>% unlist()
##  [1] "AU_ArmyBay"      "AU_Eastern"      "AU_Rosedale"     "AU_SouthWestern"
##  [5] "AU_Warkworth"    "AU_Western"      "CA_Ashburton"    "CA_Christchurch"
##  [9] "CA_Rangiora"     "MA_Blenheim"     "MW_Levin"        "WG_Karori"      
## [13] "WG_Masterton"    "WG_MoaPoint"     "WG_Paraparaumu"  "WG_Porirua"     
## [17] "WG_Seaview"      "WK_Hamilton"     "WK_Thames"
weather_model_prep = function(data){
  data %>% group_by(Date) %>% 
    summarise(avg_rain = mean(Amount.mm.)) %>% 
    mutate(avg_rain.lag1 = lag(avg_rain, 1),
           avg_rain.lag2 = lag(avg_rain, 2),
           avg_rain.lag3 = lag(avg_rain, 3),
           avg_rain.lag4 = lag(avg_rain, 4),
           avg_rain.lag5 = lag(avg_rain, 5),
           avg_rain.lag6 = lag(avg_rain, 6),
           avg_rain.lag7 = lag(avg_rain, 7))
}

# Locations mapped.
data.weather.whole.model = data.weather.whole %>% map(weather_model_prep)

data.waste.whole = data.waste %>% 
  filter(SampleLocation %in% names(data.weather.whole)) %>% 
  arrange(SampleLocation) %>% 
  rename('Date' = Collected) %>% 
  group_by(SampleLocation) %>% 
  group_split()

# Check names match
names(data.waste.whole) = data.waste.whole %>% map(~.$SampleLocation %>% unique()) %>% unlist()

# Get population estimates

# Old population estimates (ESR)
pop_names = map_chr(data.waste.whole, ~.$SampleLocation[1] %>% str_sub(4, -1))
data.population_old = data.sites %>% mutate(catchment = SampleLocation %>% str_sub(4,-1)) %>% filter(catchment %in% pop_names) %>%
  select(catchment, Population) %>%
  arrange(catchment)

# New Estimates (Mackay)
data.population = read_csv('../SitePopulationData.csv')
# These Sites have macron a's and o's that read_csv hasn't recognised. We'll replace these manually. 
data.population[data.population$Site %>% str_detect('\\?'),]$Site = c('Opotiki', 'Taupo', 'Whakatane', 'Whangarei')

data.population = data.population %>% 
  rename('catchment' = Site, 'Population' = `Population Size`) %>% 
  mutate(catchment = catchment %>% str_replace_all(' ', '')) %>% 
  # The Eastern and South Western Interceptors are locations of interest but have 'Mangere' and 'Interceptor' attached. Remove this too
  mutate(catchment = ifelse(catchment %>% str_detect('stern') & Region == 'Auckland', str_extract(catchment, '.*stern') %>% str_sub(8,-1), catchment))

data.population = data.population %>% 
  arrange(catchment)


# datapops joined because Mackay doesnt have all locations of interest. Prefer Mackay's, but will use ESR's if necessary
data.population_joined = data.population %>% 
  rbind(data.population_old %>% 
          cbind(data.frame(Region = rep(NA, nrow(data.population_old)), Island = rep(NA, nrow(data.population_old))))) %>% 
  select(-(2:3)) %>% 
  # ID's have different capitalization
  mutate(catchment_lower = tolower(catchment)) %>% 
  distinct(catchment_lower, .keep_all = TRUE) %>% 
  arrange(catchment) %>% 
  filter(catchment_lower %in% tolower(pop_names)) %>% 
  select(-3)


# Southwestern is causing issues. Just fix the damn name
data.population_joined[data.population_joined$catchment == 'Southwestern',]$catchment = 'SouthWestern'

# Create the final modelling dataset
model.data = data.waste.whole %>% 
  map2(data.weather.whole.model, inner_join, by = c('Date')) %>% 
  map2(data.population_joined$Population, cbind) %>% 
  map(rename, 'population' = `.y[[i]]`) %>% 
  map(mutate, sars_per_person = sars_gcl / population)
names(model.data) = names(pop_names)

model.data$CA_Christchurch

Join with flow.

data.flow.split = data.flow %>%
  rename('Date' = DATECOLLECTED) %>% 
  group_by(SAMPLELOCATION) %>% 
  group_split()

# Flow locations

model.data.flow = model.data[map_chr(data.flow.split, ~.$SAMPLELOCATION[[1]])]

# Join the flow estimates into the rain and waste data.

model.data.flow = map2(model.data.flow, data.flow.split, left_join, by = 'Date')

First I am interested in seeing how much rain explains the daily flow.

model.data.flow_joined = model.data.flow %>% 
  do.call(rbind, .) %>% 
  as_tibble()


# Flow over time
model.data.flow_joined %>% 
  ggplot(aes(x = Date, y = daily_flow_rate_m3)) +
  geom_point() +
  facet_wrap(~SampleLocation, scales = 'free_y')
## Warning: Removed 446 rows containing missing values (`geom_point()`).

Flow seems to vary over time for all locations. No data for AU_Western.

# Rain in line with Flow

normalise <- function(v){
  (v - min(v, na.rm = T)) / (max(v, na.rm = T) - min(v, na.rm = T))
}

# Normalise the columns
model.data.flow_joined %>% 
  mutate(cumulative_rain_7days = avg_rain + avg_rain.lag1 + avg_rain.lag2 + avg_rain.lag3 + avg_rain.lag4 + avg_rain.lag5 + avg_rain.lag6,
         cumulative_rain_7days.normal = normalise(cumulative_rain_7days),
         daily_flow_rate_m3.normal = normalise(daily_flow_rate_m3)) %>% 
  ggplot(aes(x = Date, y = cumulative_rain_7days.normal)) +
  geom_line(alpha = 0.3, col = 'blue') +
  geom_point(aes(y = daily_flow_rate_m3.normal)) +
  facet_wrap(~SampleLocation, scales = 'free_y')
## Warning: Removed 446 rows containing missing values (`geom_point()`).

Rain over the past week seems to be reasonably correlated with flow in those places which have varied flow as previously identified.

model.data.flow_joined %>% 
  mutate(cumulative_rain_7days = avg_rain + avg_rain.lag1 + avg_rain.lag2 + avg_rain.lag3 + avg_rain.lag4 + avg_rain.lag5 + avg_rain.lag6) %>% 
  ggplot(aes(x = cumulative_rain_7days, y = daily_flow_rate_m3)) +
  geom_point(alpha = 0.3) +
  facet_wrap(~SampleLocation, scales = 'free_y') +
  labs(x = 'Total Rain over the past week (ml)')
## Warning: Removed 446 rows containing missing values (`geom_point()`).

In all locations, there seems to be a somewhat strong positive relationship between the amount of rain over the past 7 days and the amount of flow.

Modelling

coef(rain_flow.fit2)[names(coef(rain_flow.fit2)) %>% str_detect('rain')] %>% 
  cbind(rain = names(.), effect_percent = (exp(.) - 1)*100) %>% 
  as_tibble() %>% 
  select(-1) %>% 
  mutate(effect_percent = as.numeric(effect_percent),
         cumulative_percent = cumsum(effect_percent),
         day = 0:7) %>% 
  ggplot(aes(x = day, y = effect_percent)) +
  geom_point(size = 3) +
  geom_smooth(se = F, aes(col = 'Individual Effect')) +
  geom_smooth(aes(y = cumulative_percent, col = 'Cumulative Effect'), se = F) +
  geom_point(aes(y = cumulative_percent), col = alpha('red', 0.7), pch = 5) +
  theme_bw() +
  labs(title = "Effect and Cumulative effect of a week's rain on Wastewater flow.",
       x = "Day's prior", y = '% Increase in flow', colour = '') +
  scale_x_continuous(breaks = 0:7) +
  scale_y_continuous(breaks = seq(0, 5, 0.2)) +
  theme(legend.position = 'top') +
  scale_color_discrete()

Benefit to using Flow over Rain.

source('../produce_modelling_summaries.R')
models = map(model.data.flow, produce_rainsummary.gam)

map(models, print_rainsummary.gam)
## 
## 
## Table: AU_Eastern
## 
## | Concentration Loss %| 97.5%|   2.5%|   P-Value|
## |--------------------:|-----:|------:|---------:|
## |                1.155| 6.155| -4.111| 0.6543066|
## 
## 
## Table: AU_Rosedale
## 
## | Concentration Loss %|  97.5%|  2.5%|   P-Value|
## |--------------------:|------:|-----:|---------:|
## |                6.691| 10.339| 2.895| 0.0005142|
## 
## 
## Table: AU_SouthWestern
## 
## | Concentration Loss %| 97.5%|   2.5%|   P-Value|
## |--------------------:|-----:|------:|---------:|
## |                0.811| 6.799| -5.561| 0.7935892|
## 
## 
## Table: AU_Western
## 
## | Concentration Loss %| 97.5%|   2.5%|   P-Value|
## |--------------------:|-----:|------:|---------:|
## |                2.997| 7.424| -1.641| 0.1925965|
## 
## 
## Table: CA_Christchurch
## 
## | Concentration Loss %|  97.5%|  2.5%|  P-Value|
## |--------------------:|------:|-----:|--------:|
## |               10.134| 15.778| 4.111| 0.000987|
## 
## 
## Table: WG_Karori
## 
## | Concentration Loss %|  97.5%|   2.5%|   P-Value|
## |--------------------:|------:|------:|---------:|
## |                5.273| 11.075| -0.906| 0.0864477|
## 
## 
## Table: WG_MoaPoint
## 
## | Concentration Loss %|  97.5%|  2.5%|   P-Value|
## |--------------------:|------:|-----:|---------:|
## |                8.433| 14.239| 2.234| 0.0071502|
## 
## 
## Table: WG_Porirua
## 
## | Concentration Loss %|  97.5%|  2.5%|   P-Value|
## |--------------------:|------:|-----:|---------:|
## |                7.283| 11.597| 2.758| 0.0015044|
## 
## 
## Table: WG_Seaview
## 
## | Concentration Loss %|  97.5%|  2.5%| P-Value|
## |--------------------:|------:|-----:|-------:|
## |                8.944| 12.566| 5.172| 3.9e-06|
## $AU_Eastern
## 
## $AU_Rosedale
## 
## $AU_SouthWestern
## 
## $AU_Western
## 
## $CA_Christchurch
## 
## $WG_Karori
## 
## $WG_MoaPoint
## 
## $WG_Porirua
## 
## $WG_Seaview

In almost every location, rain proved to explain some of the variance in the observed concentration of COVID-19 bodies.
In AU_Western, this effect is not totally significant and in AU_Eastern rain appears to have also no effect let alone MLE positive effect on the concentration. These are the locations that I expect to see more variance explained when we factor in flow.

With flow in the model.

# Can't map some locations
map_dbl(model.data.flow, ~filter(., !is.na(daily_flow_rate_m3)) %>% nrow())
##      AU_Eastern     AU_Rosedale AU_SouthWestern      AU_Western CA_Christchurch 
##              39              71              38               0              58 
##       WG_Karori     WG_MoaPoint      WG_Porirua      WG_Seaview 
##              47              70              71              71
models = map(model.data.flow[-4], produce_flowrainsummary.gam)

map(models, print_flowrainsummary.gam)
## 
## 
## Table: AU_Eastern
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |  1824.969|                       10.48740| 75.97150| -233.4584| 0.8677053|
## |Median    |  2220.906|                       12.61333| 82.36515| -333.0307| 0.8677053|
## |Upper 75% |  3137.688|                       17.34408| 91.38457| -692.9963| 0.8677053|
## 
## 
## Table: AU_Rosedale
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |   49409.5|                       47.75869| 76.17781| -14.56355| 0.1041809|
## |Median    |   54654.0|                       51.23783| 79.54252| -16.22883| 0.1041809|
## |Upper 75% |   67712.5|                       58.92698| 85.99788| -20.48121| 0.1041809|
## 
## 
## Table: AU_SouthWestern
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|  P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|--------:|
## |Lower 25% |  322.9357|                      -32.74890| 43.03762| -209.3668| 0.509977|
## |Median    |  362.0000|                      -37.37683| 46.78641| -254.6536| 0.509977|
## |Upper 75% |  547.7163|                      -61.68400| 61.49968| -579.0000| 0.509977|
## 
## 
## Table: CA_Christchurch
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |  138267.6|                       73.84796| 94.29508| -19.88420| 0.0854828|
## |Median    |  146598.6|                       75.87827| 95.19925| -21.20140| 0.0854828|
## |Upper 75% |  163598.7|                       79.54543| 96.62410| -23.93428| 0.0854828|
## 
## 
## Table: WG_Karori
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |  3384.082|                       30.38666| 56.99065| -12.67358| 0.1430274|
## |Median    |  3781.303|                       33.28434| 61.04613| -14.26282| 0.1430274|
## |Upper 75% |  4829.937|                       40.36757| 70.00825| -18.56682| 0.1430274|
## 
## 
## Table: WG_MoaPoint
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|     97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|---------:|---------:|---------:|
## |Lower 25% |  63208.56|                      -154.8860| -30.57718| -397.5363| 0.0072714|
## |Median    |  67393.95|                      -171.1767| -32.90444| -453.3059| 0.0072714|
## |Upper 75% |  83168.15|                      -242.4990| -42.05453| -725.7784| 0.0072714|
## 
## 
## Table: WG_Porirua
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|       2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|----------:|---------:|
## |Lower 25% |  21285.94|                      -18.59088| 13.55159|  -62.68426| 0.2857408|
## |Median    |  24401.17|                      -21.58746| 15.37448|  -74.69330| 0.2857408|
## |Upper 75% |  35960.33|                      -33.38339| 21.80883| -127.53372| 0.2857408|
## 
## 
## Table: WG_Seaview
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|       2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|----------:|---------:|
## |Lower 25% |  50866.01|                      -0.931197| 39.33606|  -67.92687| 0.9710945|
## |Median    |  57099.18|                      -1.045902| 42.94013|  -78.93967| 0.9710945|
## |Upper 75% |  76807.93|                      -1.409446| 52.98633| -118.74222| 0.9710945|
## $AU_Eastern
## 
## $AU_Rosedale
## 
## $AU_SouthWestern
## 
## $CA_Christchurch
## 
## $WG_Karori
## 
## $WG_MoaPoint
## 
## $WG_Porirua
## 
## $WG_Seaview

Unless my modelling techniques are incorrect, this is saying that in almost every case it is ideal to have good flow estimates.
So, is flow useful in all locations?

map(models, print_flowsummary.gam)
## 
## 
## Table: AU_Eastern
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |  1824.969|                       10.48740| 75.97150| -233.4584| 0.8677053|
## |Median    |  2220.906|                       12.61333| 82.36515| -333.0307| 0.8677053|
## |Upper 75% |  3137.688|                       17.34408| 91.38457| -692.9963| 0.8677053|

## NULL

## NULL
## 
## 
## Table: AU_Rosedale
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|  P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|--------:|
## |Lower 25% |  49187.00|                       46.85721| 73.49965| -6.570539| 0.074627|
## |Median    |  54386.50|                       50.29256| 76.97052| -7.289854| 0.074627|
## |Upper 75% |  66220.25|                       57.30599| 83.26884| -8.945124| 0.074627|

## NULL

## NULL
## 
## 
## Table: AU_SouthWestern
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|  P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|--------:|
## |Lower 25% |  322.9357|                      -32.74890| 43.03762| -209.3668| 0.509977|
## |Median    |  362.0000|                      -37.37683| 46.78641| -254.6536| 0.509977|
## |Upper 75% |  547.7163|                      -61.68400| 61.49968| -579.0000| 0.509977|

## NULL

## NULL
## 
## 
## Table: CA_Christchurch
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |  138367.9|                       75.04558| 93.80432| -0.509136| 0.0518602|
## |Median    |  145670.9|                       76.80848| 94.65021| -0.536080| 0.0518602|
## |Upper 75% |  158901.4|                       79.69114| 95.89949| -0.584911| 0.0518602|

## NULL

## NULL
## 
## 
## Table: WG_Karori
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|      2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|---------:|---------:|
## |Lower 25% |  3381.513|                       30.50597| 58.32700| -15.88846| 0.1651138|
## |Median    |  3674.000|                       32.65946| 61.36563| -17.37602| 0.1651138|
## |Upper 75% |  4651.875|                       39.38633| 70.00544| -22.48945| 0.1651138|

## NULL

## NULL
## 
## 
## Table: WG_MoaPoint
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|     97.5%|      2.5%|  P-Value|
## |:---------|---------:|------------------------------:|---------:|---------:|--------:|
## |Lower 25% |  62880.68|                      -151.2889| -24.87733| -405.6649| 0.010886|
## |Median    |  66666.72|                      -165.6242| -26.55895| -457.4968| 0.010886|
## |Upper 75% |  82595.57|                      -235.4586| -33.88562| -740.5121| 0.010886|

## NULL

## NULL
## 
## 
## Table: WG_Porirua
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|       2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|----------:|---------:|
## |Lower 25% |  21096.88|                      -18.70439| 11.78726|  -59.73580| 0.2528757|
## |Median    |  23933.78|                      -21.47316| 13.26250|  -70.11938| 0.2528757|
## |Upper 75% |  35886.58|                      -33.86616| 19.21202| -121.81703| 0.2528757|

## NULL

## NULL
## 
## 
## Table: WG_Seaview
## 
## |Quantile  | Flow Rate| % Concentration Loss Explained|    97.5%|       2.5%|   P-Value|
## |:---------|---------:|------------------------------:|--------:|----------:|---------:|
## |Lower 25% |  49854.94|                      -8.061532| 31.63517|  -70.80850| 0.7361082|
## |Median    |  55110.34|                      -8.948315| 34.32171|  -80.72539| 0.7361082|
## |Upper 75% |  75664.01|                     -12.486939| 43.85276| -125.35945| 0.7361082|

## NULL

## NULL
## $AU_Eastern
## NULL
## 
## $AU_Rosedale
## NULL
## 
## $AU_SouthWestern
## NULL
## 
## $CA_Christchurch
## NULL
## 
## $WG_Karori
## NULL
## 
## $WG_MoaPoint
## NULL
## 
## $WG_Porirua
## NULL
## 
## $WG_Seaview
## NULL